{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the old dataset:\n[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]\nmy test entropy should be 0.97095 :\n0.970950594455\nthe new dataset:\n[[1, 1, 'mabey'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]\nmy test entropy should be 1.37095 :\n1.37095059445\n"
     ]
    }
   ],
   "source": [
    "#决策树算法\n",
    "\n",
    "import math\n",
    "#小的数据集\n",
    "def createDataSet():\n",
    "    dataSet = [[1, 1, 'yes'],\n",
    "               [1, 1, 'yes'],\n",
    "               [1, 0, 'no'],\n",
    "               [0, 1, 'no'],\n",
    "               [0, 1, 'no']]\n",
    "    featureNames = ['no surfacing','flippers'] #不浮出水面是否存活 ，有无脚蹼\n",
    "    #change to discrete values\n",
    "    return dataSet, featureNames\n",
    "\n",
    "#计算信息熵,因为我们会利用最大信息增益的方法划分数据集-----看哪个特征划分使得，信息熵(数据无序度)减小的最多\n",
    "def Entropy(dataSet):\n",
    "    num = len(dataSet)\n",
    "    labelCounts = {}\n",
    "    for featVec in dataSet: # 统计每个类别的数量\n",
    "        currentLabel = featVec[-1] #最后1列为键\n",
    "        if currentLabel not in labelCounts.keys(): \n",
    "            labelCounts[currentLabel] = 0 #初始值=0\n",
    "        labelCounts[currentLabel] += 1 #统计+1\n",
    "    entropy = 0.0\n",
    "    for key in labelCounts:  #\n",
    "        prob = float(labelCounts[key])/num\n",
    "        entropy -= prob * math.log(prob,2) #log base 2\n",
    "    return entropy\n",
    "\n",
    "#测试一下\n",
    "myData,myFeatureNames = createDataSet()\n",
    "print \"the old dataset:\\n\",myData\n",
    "\n",
    "myEntropy = Entropy(myData)\n",
    "print \"my test entropy should be 0.97095 :\\n\",myEntropy\n",
    "\n",
    "#添加一类，mabey,yes,no   ====熵越高，表明数据越混乱\n",
    "myData[0][-1] = \"mabey\"\n",
    "print \"the new dataset:\\n\",myData\n",
    "myEntropy = Entropy(myData)\n",
    "print \"my test entropy should be 1.37095 :\\n\",myEntropy\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1, 'yes'], [1, 'yes'], [0, 'no']]\n"
     ]
    }
   ],
   "source": [
    "#划分数据集\n",
    "\n",
    "\n",
    "#按照给定的特征axis,根据他的取值value，划分数据集，返回新的数据集合，少了1个特征（划分依据的那个特征axis）\n",
    "\n",
    "def splitDataSet(dataSet, axis, value):\n",
    "    returnDataSet = []\n",
    "    for dataVec in dataSet:\n",
    "        if dataVec[axis] == value:\n",
    "            tempVec = dataVec[:axis]     #0--(axis-1)\n",
    "            tempVec.extend(dataVec[axis+1:]) #(axis+1)--(-1) #所以减去了axis\n",
    "            returnDataSet.append(tempVec) \n",
    "    return returnDataSet\n",
    "\n",
    "#测试一下\n",
    "myData,myFeatureNames = createDataSet()\n",
    "print splitDataSet(myData,0,1) #axis = 0,且这个特征的值=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "best feature shoule be 0\n0\n"
     ]
    }
   ],
   "source": [
    "#看哪个特征划分使得，信息熵(数据无序度)减小的最多\n",
    "\n",
    "def chooseBestFeatureToSplit(dataSet):\n",
    "    numFeatures = len(dataSet[0]) - 1      #最后1列是类别\n",
    "    baseEntropy = Entropy(dataSet)         #首先计算原始的信息熵\n",
    "    bestInfoGain = 0.0; bestFeature = -1\n",
    "    for i in range(numFeatures):        #迭代所有的特征\n",
    "        #将数据集中的第i个特征的值，放到一个list中\n",
    "        featureList = [example[i] for example in dataSet]\n",
    "        uniqueVal = set(featureList)       #用set去重\n",
    "        newEntropy = 0.0\n",
    "        for value in uniqueVal:  \n",
    "            subDataSet = splitDataSet(dataSet, i, value)#对第i个特征，针对某个值划分\n",
    "            prob = len(subDataSet)/float(len(dataSet))\n",
    "            newEntropy += prob * Entropy(subDataSet)   #累加信息熵  \n",
    "        infoGain = baseEntropy - newEntropy     #计算这次划分的信息增益\n",
    "        if (infoGain > bestInfoGain):       \n",
    "            bestInfoGain = infoGain         #替换好的值\n",
    "            bestFeature = i\n",
    "    return bestFeature                      #return 最好的划分特征下标i\n",
    "\n",
    "\n",
    "\n",
    "#测试下\n",
    "index = chooseBestFeatureToSplit(myData)\n",
    "print \"best feature shoule be 0\\n\",index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the majorityClassCount is 2\n2\n"
     ]
    }
   ],
   "source": [
    "import operator\n",
    "\n",
    "#对字典排序，取得最大值:将多数的类别标签作为“叶子节点”的类别\n",
    "def majorityCnt(classList):\n",
    "    classCount={}\n",
    "    for vote in classList:\n",
    "        if vote not in classCount.keys(): classCount[vote] = 0\n",
    "        classCount[vote] += 1\n",
    "    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)\n",
    "    #sorted(classCount.items(), key=lambda x:x[1], reverse=True) #对字典排序\n",
    "    return sortedClassCount[0][0] #返回多数的那个类别\n",
    "\n",
    "#测试\n",
    "c = [1,1,1,0,0,2,2,2,2]\n",
    "print \"the majorityClassCount is 2\\n\",majorityCnt(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}\n['flippers']\n"
     ]
    }
   ],
   "source": [
    "#根据数据集合，创建一个完整的决策树\n",
    "\n",
    "def createTree(dataSet,featureNames):\n",
    "    classList = [example[-1] for example in dataSet]\n",
    "    if classList.count(classList[0]) == len(classList): #如果所有的类都一样，第0类的个数==长度\n",
    "        return classList[0]#stop splitting when all of the classes are equal\n",
    "    if len(dataSet[0]) == 1: #stop splitting when there are no more features in dataSet\n",
    "        return majorityCnt(classList) #如果所有的特征都被遍历用于划分数据\n",
    "    bestFeature = chooseBestFeatureToSplit(dataSet)\n",
    "    bestFeatureName = featureNames[bestFeature]\n",
    "    myTree = {bestFeatureName:{}} #用字典存储树结构\n",
    "    del(featureNames[bestFeature]) #从特征名称列表中删除这个bestFeatureName\n",
    "    featureValues = [example[bestFeature] for example in dataSet]  #遍历最好的特征的取值，进行划分\n",
    "    uniqueVals = set(featureValues)\n",
    "    for value in uniqueVals:\n",
    "        subNames = featureNames[:]  #copy all of featureName, 为了保留原有的featureName不被函数修改\n",
    "        myTree[bestFeatureName][value] = \\\n",
    "            createTree(splitDataSet(dataSet, bestFeature, value),subNames) #返回的是一个字典\n",
    "    return myTree \n",
    "\n",
    "\n",
    "\n",
    "#测试一下 \n",
    "myData,myFeatureNames = createDataSet()\n",
    "myTree = createTree(myData,myFeatureNames)\n",
    "print myTree\n",
    "print myFeatureNames  #最后的list，会被改变(！！！因为函数参数是list,参数是按照引用的方式传递的)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1,0] should be classify as 'no'\nno\n[1,1] should be classify as 'yes'\nyes\n"
     ]
    }
   ],
   "source": [
    "#测试算法：对于某个输入变量x，使用决策树，进行分类\n",
    "\n",
    "def classify(inputTree,featureNames,testVec):\n",
    "    firstKey = inputTree.keys()[0] #第一个key,根节点,k=某个特征a的名称\n",
    "    secondDict = inputTree[firstKey] #第二个字典,key是特征a的所有取值\n",
    "    featureIndex = featureNames.index(firstKey)\n",
    "    key = testVec[featureIndex] \n",
    "    valueOfFeature = secondDict[key] #获得叶子节点的值，要么是“类别标签”，要么是key=某个特征b的“字典”\n",
    "    if isinstance(valueOfFeature, dict): #判断实例是否是类型(tuple,dict,int,float) \n",
    "        classLabel = classify(valueOfFeature, featureNames, testVec) #是字典，继续递归\n",
    "    else: classLabel = valueOfFeature #是类别标签，直接返回\n",
    "    return classLabel\n",
    "\n",
    "myData,myFeatureNames = createDataSet() #因为myFeatureNames在函数中，改变了，所以重新加载\n",
    "class0 = classify(myTree,myFeatureNames,[1,0])\n",
    "print \"[1,0] should be classify as 'no'\\n\",class0\n",
    "\n",
    "class1 = classify(myTree,myFeatureNames,[1,1])\n",
    "print \"[1,1] should be classify as 'yes'\\n\",class1\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "store decisionTree/myTree.txt successfully!\n{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}\n"
     ]
    }
   ],
   "source": [
    "#使用pickle模块保存决策树参数和结构\n",
    "\n",
    "import pickle\n",
    "\n",
    "def storeTree(inputTree,filename):\n",
    "    fw = open(filename,'w')\n",
    "    pickle.dump(inputTree,fw) #序列化对象，保存到磁盘\n",
    "    print \"store %s successfully!\"%(filename)\n",
    "    fw.close()\n",
    "    \n",
    "def getTree(filename):\n",
    "    fr = open(filename) \n",
    "    return pickle.load(fr) #从磁盘读取,从file中读取一个字符串，并将它重构为原来的python对象\n",
    "\n",
    "filename = \"decisionTree/myTree.txt\"\n",
    "storeTree(myTree,filename)\n",
    "tree = getTree(filename)\n",
    "print tree\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
